// Session setup: wipe whatever is in memory, set resource limits, and pin the
// Stata version the rest of this do-file is interpreted under.
clear all
// "perm" makes the more-off setting persist beyond this session.
set more off, perm
// NOTE(review): newer Stata releases manage memory automatically; confirm
// whether this line still has any effect on the Stata version actually used.
set mem 10000000
set matsize 10000
// Version 14 was tried and is disabled; the script runs under version 12 rules.
*version 14
version 12

****************************************************************** 
*** Schools Fuzzy Merge (remaining unmatched villages) ***********
****************************************************************** 

// Every relative file name below resolves against $smerge. That global is not
// defined anywhere in this file, so it must be set before the script is run.
cd "$smerge"

****************************************************************** 
****************************************************************** 

** Step 1:  Unmatched, bk_code_pca
{

* ---- Master side: unmatched PCA village names --------------------------
* Builds temp_clean.dta: string geography codes, a combined state+district
* key (stdt), bracket-normalised names, and repeated name variants flagged.
use "pca_2001_names_merges_all_temp_unmatched.dta", clear
tostring st_code dt_code bk_code*, replace
capture drop stdt
generate stdt = st_code + " " + dt_code

* Swap round brackets for curly ones in every village* variable.
foreach v of varlist village* {
	replace `v' = subinstr(subinstr(`v',"(","{",.),")","}",.)
}

* Walk the name variants in priority order. A non-empty variant that equals
* any earlier variant is overwritten with the marker "--DUPLICATE--".
local earlier village_pca
foreach v in village_vd village11 village_conc01 village_conc11 village_vd11 {
	local same 0
	foreach e of local earlier {
		local same "`same' | `v'==`e'"
	}
	replace `v' = "--DUPLICATE--" if (`same') & `v'!=""
	local earlier `earlier' `v'
}
save "temp_clean.dta", replace

* ---- Using side: unmatched school-census villages ----------------------
* Same key construction and bracket normalisation, saved under a _temp name.
use "$schools/school_census_for_merge_temp_unmatched.dta", clear
tostring st_code dt_code bk_code*, replace
capture drop stdt
generate stdt = st_code + " " + dt_code
replace village = subinstr(subinstr(village,"(","{",.),")","}",.)
save "school_census_for_merge_temp_unmatched_temp.dta", replace

*Reclink all village names
* For each village-name variant, fuzzy-match it (reclink, minscore 0.95) against
* the schools file, requiring stdt and bk_code_pca to agree, and accumulate the
* per-variant results (score, merge flag, matched schv_id, matched schools name)
* as extra columns in temp.dta.

use "temp_clean.dta", clear
save "temp.dta", replace

timer clear
timer on 1
foreach vi of varlist village_pca village_vd village11 village_conc01  village_vd11 { // no village_conc11 bc all dupes
	* Clear leftovers from the previous pass one family at a time: a single
	* "drop a* b* ..." drops nothing at all if any one pattern has no match.
	cap drop fscore*
	cap drop fmerge*
	cap drop schv_id*
	cap drop Uvillage*
	cap drop village
	duplicates drop
	* Suffix for this variant: text from position 8 of the variable name with
	* underscores removed (village_pca -> pca, village11 -> 11, ...).
	local vi_stub = subinstr(substr("`vi'",8,99),"_","",.)
	rename `vi' village
	keep if village!="--DUPLICATE--" & village!=""
	reclink stdt bk_code_pca village using "school_census_for_merge_temp_unmatched_temp.dta", idmaster(names_id) ///
		idusing(schv_id) gen(fscore_pca_`vi_stub') required(stdt bk_code_pca) _merge(fmerge_pca_`vi_stub') minscore(0.95)
	keep if fmerge_pca_`vi_stub'==3
	* Store the ID copy as double so large integer IDs are not rounded by the
	* default float storage type.
	gen double schv_id_pca_`vi_stub' = schv_id
	rename Uvillage Uvillage_pca_`vi_stub'
	keep names_id Uvillage_pca_`vi_stub' fscore_pca_`vi_stub' fmerge_pca_`vi_stub' schv_id_pca_`vi_stub' 
	duplicates drop
	duplicates drop names_id, force // in case of a few remaining ties
	* The data in memory is now unique on names_id, so this is a 1:m merge;
	* spelling it that way makes Stata verify the assumption.
	merge 1:m names_id using "temp.dta", nogen 
	save "temp.dta", replace
}
timer off 1
timer list


*Define a single schv_id for each row
* Collapse the per-variant reclink results in temp.dta into one match per row:
* the first variant (in the order listed below) whose score equals the row
* maximum supplies schv_id, score, variant label and both matched names.
use "temp.dta", clear
* Temporary rename so every name variant follows the village_<suffix> pattern
* used inside the loop; undone right after it.
rename village11 village_11
duplicates drop
* double: keeps the equality test against the per-variant scores exact whatever
* storage type those scores have (egen would otherwise default to float).
egen double fscore_rowmax = rowmax(fscore_pca_*)
egen fmerge_rowmax = rowmax(fmerge_pca_*)
cap drop schv_id
* double: the default float type cannot hold integers above 16,777,216 exactly.
gen double schv_id = .
gen fscore_pca = .
gen fuzzy_pca = ""
gen Uvillage = ""
gen Mvillage = ""
foreach vi of newlist _pca _vd _11 _conc01  _vd11 {
	replace schv_id = schv_id_pca`vi' if fmerge_pca`vi'==3 & schv_id==. & fscore_pca`vi'==fscore_rowmax
	replace fscore_pca = fscore_pca`vi' if fmerge_pca`vi'==3 & fscore_pca==. & fscore_pca`vi'==fscore_rowmax
	replace fuzzy_pca = substr("`vi'",2,10) if fmerge_pca`vi'==3 & fuzzy_pca=="" & fscore_pca`vi'==fscore_rowmax
	replace Uvillage = Uvillage_pca`vi' if fmerge_pca`vi'==3 & Uvillage=="" & fscore_pca`vi'==fscore_rowmax
	replace Mvillage = village`vi' if fmerge_pca`vi'==3 & Mvillage=="" & fscore_pca`vi'==fscore_rowmax
}
rename village_11 village11
* Every row with at least one successful reclink must have received an ID.
assert schv_id!=. if fmerge_rowmax==3
drop fmerge_* fscore_pca_* fscore_rowmax schv_id_* Uvillage_pca*
duplicates drop
* Pull the schools-side attributes for the chosen ID; unmatched rows are kept.
merge m:1 schv_id using "school_census_for_merge_temp_unmatched_temp.dta", keep(1 3) keepusing(block village nschools count_sch_id)
* The name reclink reported must be the name stored under that ID.
assert Uvillage==village
drop Uvillage
duplicates drop

*Remove duplicates, so a single PCA village doesn't match to multiple schools villages
// dup1 tags rows whose names_id occurs more than once. It is computed once here
// and not refreshed after the drops below.
duplicates t names_id, gen(dup1)
// The neighbour comparisons below ([_n-1], [_n+1]) rely on rows with the same
// names_id being adjacent.
sort names_id

  // drop the few where there are just straight duplicates of both master and using identifiers
duplicates drop names_id schv_id, force

  // clean names to find identical ones among dups
  // (curly and square brackets, dots, hyphens and apostrophes become spaces,
  //  then repeated spaces are collapsed and the ends trimmed)
foreach v of varlist Mvillage village{
  replace `v' = subinstr(`v',"{"," ",.)
  replace `v' = subinstr(`v',"}"," ",.)
  replace `v' = subinstr(`v',"["," ",.)
  replace `v' = subinstr(`v',"]"," ",.)
  replace `v' = subinstr(`v',"."," ",.)
  replace `v' = subinstr(`v',"-"," ",.)
  replace `v' = subinstr(`v',"'"," ",.)
  replace `v' = trim(itrim(`v'))  
}

// Three identical passes (the loop counter is not used in the body). In each
// pass a row is dropped when an adjacent row with the same names_id is the
// better candidate under one of the rules below.
forvalues iter = 1/3 {
  // Rule 1: the neighbour's master and schools names agree once spaces are
  // removed, while this row's names do not.
  drop if dup1>0 & names_id==names_id[_n-1] & subinstr(Mvillage[_n-1]," ","",.)==subinstr(village[_n-1]," ","",.) & subinstr(Mvillage," ","",.)!=subinstr(village," ","",.)
  drop if dup1>0 & names_id==names_id[_n+1] & subinstr(Mvillage[_n+1]," ","",.)==subinstr(village[_n+1]," ","",.) & subinstr(Mvillage," ","",.)!=subinstr(village," ","",.)
  // Rule 2: both of the neighbour's names contain the digit (1 to 4, matched
  // anywhere in the string) and this row's schools name does not.
  forvalues x = 1/4 {
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1],"`x'")==1 & regexm(village[_n-1],"`x'")==1) & regexm(village,"`x'")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1],"`x'")==1 & regexm(village[_n+1],"`x'")==1) & regexm(village,"`x'")!=1
  }
  // Rule 3: same idea for the strings III, II and I, tested once with a
  // leading space and once with a trailing space.
  foreach x of newlist III II I {
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1]," `x'")==1 & regexm(village[_n-1]," `x'")==1) & regexm(village," `x'")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1]," `x'")==1 & regexm(village[_n+1]," `x'")==1) & regexm(village," `x'")!=1
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1],"`x' ")==1 & regexm(village[_n-1],"`x' ")==1) & regexm(village,"`x' ")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1],"`x' ")==1 & regexm(village[_n+1],"`x' ")==1) & regexm(village,"`x' ")!=1
  }
  // Rule 4: both of the neighbour's names end in A (or B) after removing
  // spaces, and this row's schools name does not.
   foreach x of newlist A B {
    drop if dup1>0 & names_id==names_id[_n-1] & (substr(subinstr(Mvillage[_n-1]," ","",.),-1,1)=="`x'" & substr(subinstr(village[_n-1]," ","",.),-1,1)=="`x'") &  substr(subinstr(village," ","",.),-1,1)!="`x'"
    drop if dup1>0 & names_id==names_id[_n+1] & (substr(subinstr(Mvillage[_n+1]," ","",.),-1,1)=="`x'" & substr(subinstr(village[_n+1]," ","",.),-1,1)=="`x'") &  substr(subinstr(village," ","",.),-1,1)!="`x'"
  }   
}

  // drop by fscore
  // (disabled: would keep the higher-scoring row of two adjacent candidates)
*drop if dup1>0 & names_id==names_id[_n-1] & fscore_pca<fscore_pca[_n-1] & fscore_pca!=. & fscore_pca[_n-1]!=.
*drop if dup1>0 & names_id==names_id[_n+1] & fscore_pca<fscore_pca[_n+1] & fscore_pca!=. & fscore_pca[_n+1]!=.

  // for remaining dups, treat them like quarterbacks: if you have 2, that means you really have 0  (and reset all schools variables)
  // dup1a re-tags names_id values that still occur more than once; for those
  // rows every schools-side field is blanked when _merge==3. _merge is not
  // created in this section and is expected to exist from an earlier merge.
duplicates t names_id, gen(dup1a)
replace schv_id = . 	if _merge==3 & dup1a>0
replace block = "" 		if _merge==3 & dup1a>0
replace village = "" 	if _merge==3 & dup1a>0
replace nschools = . 	if _merge==3 & dup1a>0
replace count_sch_id = . if _merge==3 & dup1a>0
replace fscore_pca = .  if _merge==3 & dup1a>0
replace fuzzy_pca = ""  if _merge==3 & dup1a>0
replace Mvillage = ""   if _merge==3 & dup1a>0

duplicates drop
// Report only; nothing is changed by "duplicates r".
duplicates r names_id

*Drop duplicates and bad matches, confirm a(n almost) 1-1 match
drop dup1* _merge Mvillage 
duplicates drop
// Reports for visual inspection of how close the result is to one-to-one.
duplicates r schv_id
duplicates r names_id

*Split "names" dataset into matched and unmatched villages
// Only rows that still carry a schools ID are kept and saved as the Step 1 result.
keep if schv_id!=.
save "temp_matched1.dta", replace

}

****************************************************************** 
****************************************************************** 

** Step 2:  Unmatched, bk_code11
{

*Reclink all village names
* Same procedure as the bk_code_pca pass, but requiring agreement on bk_code11.
* reclink's outputs are renamed to the *_pca_* pattern so the code that follows
* can address them under the same names; results accumulate in temp1.dta.
timer clear
timer on 1
use "temp_clean.dta", clear
save "temp1.dta", replace

foreach vi of varlist village_pca village_vd village11 village_conc01 village_vd11 { // no village_conc11 bc all dupes
	* Clear leftovers from the previous pass, one family per command so that a
	* missing family cannot block the others from being dropped.
	cap drop fscore* 
	cap drop fmerge* 
	cap drop schv_id* 
	cap drop Uvillage* 
	cap drop village
	duplicates drop
	* Suffix for this variant: text from position 8 of the variable name with
	* underscores removed (village_pca -> pca, village11 -> 11, ...).
	local vi_stub = subinstr(substr("`vi'",8,99),"_","",.)
	rename `vi' village
	keep if village!="--DUPLICATE--" & village!=""
	reclink stdt bk_code11 village using "school_census_for_merge_temp_unmatched_temp.dta", idmaster(names_id) ///
		idusing(schv_id) gen(fscore_11_`vi_stub') required(stdt bk_code11) _merge(fmerge_11_`vi_stub') minscore(0.95)
	rename fmerge_11_`vi_stub' fmerge_pca_`vi_stub'	
	rename fscore_11_`vi_stub' fscore_pca_`vi_stub'
	keep if fmerge_pca_`vi_stub'==3
	* Store the ID copy as double so large integer IDs are not rounded by the
	* default float storage type.
	gen double schv_id_pca_`vi_stub' = schv_id
	rename Uvillage Uvillage_pca_`vi_stub'
	keep names_id Uvillage_pca_`vi_stub' fscore_pca_`vi_stub' fmerge_pca_`vi_stub' schv_id_pca_`vi_stub' 
	duplicates drop
	duplicates drop names_id, force // in case of a few remaining ties
	* The data in memory is now unique on names_id, so this is a 1:m merge;
	* spelling it that way makes Stata verify the assumption.
	merge 1:m names_id using "temp1.dta", nogen 
	save "temp1.dta", replace
}
timer off 1
timer list


*Define a single schv_id for each row
* Collapse the per-variant reclink results in temp1.dta into one match per row:
* the first variant (in the order listed below) whose score equals the row
* maximum supplies schv_id, score, variant label and both matched names.
use "temp1.dta", clear
* Temporary rename so every name variant follows the village_<suffix> pattern
* used inside the loop; undone right after it.
rename village11 village_11
duplicates drop
* double: keeps the equality test against the per-variant scores exact whatever
* storage type those scores have (egen would otherwise default to float).
egen double fscore_rowmax = rowmax(fscore_pca_*)
egen fmerge_rowmax = rowmax(fmerge_pca_*)
cap drop schv_id
* double: the default float type cannot hold integers above 16,777,216 exactly.
gen double schv_id = .
gen fscore_pca = .
gen fuzzy_pca = ""
gen Uvillage = ""
gen Mvillage = ""
foreach vi of newlist _pca _vd _11 _conc01  _vd11 {
	replace schv_id = schv_id_pca`vi' if fmerge_pca`vi'==3 & schv_id==. & fscore_pca`vi'==fscore_rowmax
	replace fscore_pca = fscore_pca`vi' if fmerge_pca`vi'==3 & fscore_pca==. & fscore_pca`vi'==fscore_rowmax
	replace fuzzy_pca = substr("`vi'",2,10) if fmerge_pca`vi'==3 & fuzzy_pca=="" & fscore_pca`vi'==fscore_rowmax
	replace Uvillage = Uvillage_pca`vi' if fmerge_pca`vi'==3 & Uvillage=="" & fscore_pca`vi'==fscore_rowmax
	replace Mvillage = village`vi' if fmerge_pca`vi'==3 & Mvillage=="" & fscore_pca`vi'==fscore_rowmax
}
rename village_11 village11
* Every row with at least one successful reclink must have received an ID.
assert schv_id!=. if fmerge_rowmax==3
drop fmerge_* fscore_pca_* fscore_rowmax schv_id_* Uvillage_pca*
duplicates drop
* Pull the schools-side attributes for the chosen ID; unmatched rows are kept.
merge m:1 schv_id using "school_census_for_merge_temp_unmatched_temp.dta", keep(1 3) keepusing(block village nschools count_sch_id)
* The name reclink reported must be the name stored under that ID.
assert Uvillage==village
drop Uvillage
duplicates drop

*Remove duplicates, so a single PCA village doesn't match to multiple schools villages
// dup1 tags rows whose names_id occurs more than once. It is computed once here
// and not refreshed after the drops below.
duplicates t names_id, gen(dup1)
// The neighbour comparisons below ([_n-1], [_n+1]) rely on rows with the same
// names_id being adjacent.
sort names_id

  // drop the few where there are just straight duplicates of both master and using identifiers
duplicates drop names_id schv_id, force

  // clean names to find identical ones among dups
  // (curly and square brackets, dots, hyphens and apostrophes become spaces,
  //  then repeated spaces are collapsed and the ends trimmed)
foreach v of varlist Mvillage village{
  replace `v' = subinstr(`v',"{"," ",.)
  replace `v' = subinstr(`v',"}"," ",.)
  replace `v' = subinstr(`v',"["," ",.)
  replace `v' = subinstr(`v',"]"," ",.)
  replace `v' = subinstr(`v',"."," ",.)
  replace `v' = subinstr(`v',"-"," ",.)
  replace `v' = subinstr(`v',"'"," ",.)
  replace `v' = trim(itrim(`v'))  
}

// Three identical passes (the loop counter is not used in the body). In each
// pass a row is dropped when an adjacent row with the same names_id is the
// better candidate under one of the rules below.
forvalues iter = 1/3 {
  // Rule 1: the neighbour's master and schools names agree once spaces are
  // removed, while this row's names do not.
  drop if dup1>0 & names_id==names_id[_n-1] & subinstr(Mvillage[_n-1]," ","",.)==subinstr(village[_n-1]," ","",.) & subinstr(Mvillage," ","",.)!=subinstr(village," ","",.)
  drop if dup1>0 & names_id==names_id[_n+1] & subinstr(Mvillage[_n+1]," ","",.)==subinstr(village[_n+1]," ","",.) & subinstr(Mvillage," ","",.)!=subinstr(village," ","",.)
  // Rule 2: both of the neighbour's names contain the digit (1 to 4, matched
  // anywhere in the string) and this row's schools name does not.
  forvalues x = 1/4 {
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1],"`x'")==1 & regexm(village[_n-1],"`x'")==1) & regexm(village,"`x'")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1],"`x'")==1 & regexm(village[_n+1],"`x'")==1) & regexm(village,"`x'")!=1
  }
  // Rule 3: same idea for the strings III, II and I, tested once with a
  // leading space and once with a trailing space.
  foreach x of newlist III II I {
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1]," `x'")==1 & regexm(village[_n-1]," `x'")==1) & regexm(village," `x'")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1]," `x'")==1 & regexm(village[_n+1]," `x'")==1) & regexm(village," `x'")!=1
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1],"`x' ")==1 & regexm(village[_n-1],"`x' ")==1) & regexm(village,"`x' ")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1],"`x' ")==1 & regexm(village[_n+1],"`x' ")==1) & regexm(village,"`x' ")!=1
  }
  // Rule 4: both of the neighbour's names end in A (or B) after removing
  // spaces, and this row's schools name does not.
   foreach x of newlist A B {
    drop if dup1>0 & names_id==names_id[_n-1] & (substr(subinstr(Mvillage[_n-1]," ","",.),-1,1)=="`x'" & substr(subinstr(village[_n-1]," ","",.),-1,1)=="`x'") &  substr(subinstr(village," ","",.),-1,1)!="`x'"
    drop if dup1>0 & names_id==names_id[_n+1] & (substr(subinstr(Mvillage[_n+1]," ","",.),-1,1)=="`x'" & substr(subinstr(village[_n+1]," ","",.),-1,1)=="`x'") &  substr(subinstr(village," ","",.),-1,1)!="`x'"
  }   
}

  // drop by fscore
  // (disabled: would keep the higher-scoring row of two adjacent candidates)
*drop if dup1>0 & names_id==names_id[_n-1] & fscore_pca<fscore_pca[_n-1] & fscore_pca!=. & fscore_pca[_n-1]!=.
*drop if dup1>0 & names_id==names_id[_n+1] & fscore_pca<fscore_pca[_n+1] & fscore_pca!=. & fscore_pca[_n+1]!=.

  // for remaining dups, treat them like quarterbacks: if you have 2, that means you really have 0  (and reset all schools variables)
  // dup1a re-tags names_id values that still occur more than once; for those
  // rows every schools-side field is blanked when _merge==3. _merge is not
  // created in this section and is expected to exist from an earlier merge.
duplicates t names_id, gen(dup1a)
replace schv_id = . 	if _merge==3 & dup1a>0
replace block = "" 		if _merge==3 & dup1a>0
replace village = "" 	if _merge==3 & dup1a>0
replace nschools = . 	if _merge==3 & dup1a>0
replace count_sch_id = . if _merge==3 & dup1a>0
replace fscore_pca = .  if _merge==3 & dup1a>0
replace fuzzy_pca = ""  if _merge==3 & dup1a>0
replace Mvillage = ""   if _merge==3 & dup1a>0
duplicates drop
// Report only; nothing is changed by "duplicates r".
duplicates r names_id

*Drop duplicates and bad matches, confirm a(n almost) 1-1 match
drop dup1* _merge Mvillage 
duplicates drop
// Reports for visual inspection of how close the result is to one-to-one.
duplicates r schv_id
duplicates r names_id

** rename
// Give the score and variant label names specific to the bk_code11 pass, so
// they do not share names with the other passes' outputs.
rename fscore_pca fscore_11
rename fuzzy_pca fuzzy_11

*Split "names" dataset into matched and unmatched villages
// Only rows that still carry a schools ID are kept and saved as the Step 2 result.
keep if schv_id!=.
save "temp_matched2.dta", replace

}

****************************************************************** 
****************************************************************** 

** Step 3:  Unmatched, bk_code2_vd11
{

*Reclink all village names
* Same procedure as the earlier reclink passes, with a third block-code
* variable as the required key. reclink's outputs are renamed to the *_pca_*
* pattern so the code that follows can address them under the same names;
* results accumulate in temp2.dta.
* NOTE(review): the heading above and the masala-merge step later in this file
* spell the block variable bk_code2_vd11, while the reclink call below uses
* bk_code_2vd11. Confirm which name exists in the data.
timer clear
timer on 1
use "temp_clean.dta", clear
save "temp2.dta", replace

foreach vi of varlist village_pca village_vd village11 village_conc01  village_vd11 { // no village_conc11 bc all dupes
	* Clear leftovers from the previous pass one family at a time: a single
	* "drop a* b* ..." drops nothing at all if any one pattern has no match.
	cap drop fscore*
	cap drop fmerge*
	cap drop schv_id*
	cap drop Uvillage*
	cap drop village
	duplicates drop
	* Suffix for this variant: text from position 8 of the variable name with
	* underscores removed (village_pca -> pca, village11 -> 11, ...).
	local vi_stub = subinstr(substr("`vi'",8,99),"_","",.)
	rename `vi' village
	keep if village!="--DUPLICATE--" & village!=""
	reclink stdt bk_code_2vd11 village using "school_census_for_merge_temp_unmatched_temp.dta", idmaster(names_id) ///
		idusing(schv_id) gen(fscore_2vd11_`vi_stub') required(stdt bk_code_2vd11) _merge(fmerge_2vd11_`vi_stub') minscore(0.95)
	rename fmerge_2vd11_`vi_stub' fmerge_pca_`vi_stub'
	rename fscore_2vd11_`vi_stub' fscore_pca_`vi_stub'
	keep if fmerge_pca_`vi_stub'==3
	* Store the ID copy as double so large integer IDs are not rounded by the
	* default float storage type.
	gen double schv_id_pca_`vi_stub' = schv_id
	rename Uvillage Uvillage_pca_`vi_stub'
	keep names_id Uvillage_pca_`vi_stub' fscore_pca_`vi_stub' fmerge_pca_`vi_stub' schv_id_pca_`vi_stub' 
	duplicates drop
	duplicates drop names_id, force // in case of a few remaining ties
	* The data in memory is now unique on names_id, so this is a 1:m merge;
	* spelling it that way makes Stata verify the assumption.
	merge 1:m names_id using "temp2.dta", nogen 
	save "temp2.dta", replace
}
timer off 1
timer list


*Define a single schv_id for each row
* Collapse the per-variant reclink results in temp2.dta into one match per row:
* the first variant (in the order listed below) whose score equals the row
* maximum supplies schv_id, score, variant label and both matched names.
use "temp2.dta", clear
* Temporary rename so every name variant follows the village_<suffix> pattern
* used inside the loop; undone right after it.
rename village11 village_11
duplicates drop
* double: keeps the equality test against the per-variant scores exact whatever
* storage type those scores have (egen would otherwise default to float).
egen double fscore_rowmax = rowmax(fscore_pca_*)
egen fmerge_rowmax = rowmax(fmerge_pca_*)
cap drop schv_id
* double: the default float type cannot hold integers above 16,777,216 exactly.
gen double schv_id = .
gen fscore_pca = .
gen fuzzy_pca = ""
gen Uvillage = ""
gen Mvillage = ""
foreach vi of newlist _pca _vd _11 _conc01  _vd11 {
	replace schv_id = schv_id_pca`vi' if fmerge_pca`vi'==3 & schv_id==. & fscore_pca`vi'==fscore_rowmax
	replace fscore_pca = fscore_pca`vi' if fmerge_pca`vi'==3 & fscore_pca==. & fscore_pca`vi'==fscore_rowmax
	replace fuzzy_pca = substr("`vi'",2,10) if fmerge_pca`vi'==3 & fuzzy_pca=="" & fscore_pca`vi'==fscore_rowmax
	replace Uvillage = Uvillage_pca`vi' if fmerge_pca`vi'==3 & Uvillage=="" & fscore_pca`vi'==fscore_rowmax
	replace Mvillage = village`vi' if fmerge_pca`vi'==3 & Mvillage=="" & fscore_pca`vi'==fscore_rowmax
}
rename village_11 village11
* Every row with at least one successful reclink must have received an ID.
assert schv_id!=. if fmerge_rowmax==3
drop fmerge_* fscore_pca_* fscore_rowmax schv_id_* Uvillage_pca*
duplicates drop
* Pull the schools-side attributes for the chosen ID; unmatched rows are kept.
merge m:1 schv_id using "school_census_for_merge_temp_unmatched_temp.dta", keep(1 3) keepusing(block village nschools count_sch_id)
* The name reclink reported must be the name stored under that ID.
assert Uvillage==village
drop Uvillage
duplicates drop

*Remove duplicates, so a single PCA village doesn't match to multiple schools villages
// dup1 tags rows whose names_id occurs more than once. It is computed once here
// and not refreshed after the drops below.
duplicates t names_id, gen(dup1)
// The neighbour comparisons below ([_n-1], [_n+1]) rely on rows with the same
// names_id being adjacent.
sort names_id

  // drop the few where there are just straight duplicates of both master and using identifiers
duplicates drop names_id schv_id, force

  // clean names to find identical ones among dups
  // (curly and square brackets, dots, hyphens and apostrophes become spaces,
  //  then repeated spaces are collapsed and the ends trimmed)
foreach v of varlist Mvillage village{
  replace `v' = subinstr(`v',"{"," ",.)
  replace `v' = subinstr(`v',"}"," ",.)
  replace `v' = subinstr(`v',"["," ",.)
  replace `v' = subinstr(`v',"]"," ",.)
  replace `v' = subinstr(`v',"."," ",.)
  replace `v' = subinstr(`v',"-"," ",.)
  replace `v' = subinstr(`v',"'"," ",.)
  replace `v' = trim(itrim(`v'))  
}

// Three identical passes (the loop counter is not used in the body). In each
// pass a row is dropped when an adjacent row with the same names_id is the
// better candidate under one of the rules below.
forvalues iter = 1/3 {
  // Rule 1: the neighbour's master and schools names agree once spaces are
  // removed, while this row's names do not.
  drop if dup1>0 & names_id==names_id[_n-1] & subinstr(Mvillage[_n-1]," ","",.)==subinstr(village[_n-1]," ","",.) & subinstr(Mvillage," ","",.)!=subinstr(village," ","",.)
  drop if dup1>0 & names_id==names_id[_n+1] & subinstr(Mvillage[_n+1]," ","",.)==subinstr(village[_n+1]," ","",.) & subinstr(Mvillage," ","",.)!=subinstr(village," ","",.)
  // Rule 2: both of the neighbour's names contain the digit (1 to 4, matched
  // anywhere in the string) and this row's schools name does not.
  forvalues x = 1/4 {
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1],"`x'")==1 & regexm(village[_n-1],"`x'")==1) & regexm(village,"`x'")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1],"`x'")==1 & regexm(village[_n+1],"`x'")==1) & regexm(village,"`x'")!=1
  }
  // Rule 3: same idea for the strings III, II and I, tested once with a
  // leading space and once with a trailing space.
  foreach x of newlist III II I {
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1]," `x'")==1 & regexm(village[_n-1]," `x'")==1) & regexm(village," `x'")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1]," `x'")==1 & regexm(village[_n+1]," `x'")==1) & regexm(village," `x'")!=1
    drop if dup1>0 & names_id==names_id[_n-1] & (regexm(Mvillage[_n-1],"`x' ")==1 & regexm(village[_n-1],"`x' ")==1) & regexm(village,"`x' ")!=1
    drop if dup1>0 & names_id==names_id[_n+1] & (regexm(Mvillage[_n+1],"`x' ")==1 & regexm(village[_n+1],"`x' ")==1) & regexm(village,"`x' ")!=1
  }
  // Rule 4: both of the neighbour's names end in A (or B) after removing
  // spaces, and this row's schools name does not.
   foreach x of newlist A B {
    drop if dup1>0 & names_id==names_id[_n-1] & (substr(subinstr(Mvillage[_n-1]," ","",.),-1,1)=="`x'" & substr(subinstr(village[_n-1]," ","",.),-1,1)=="`x'") &  substr(subinstr(village," ","",.),-1,1)!="`x'"
    drop if dup1>0 & names_id==names_id[_n+1] & (substr(subinstr(Mvillage[_n+1]," ","",.),-1,1)=="`x'" & substr(subinstr(village[_n+1]," ","",.),-1,1)=="`x'") &  substr(subinstr(village," ","",.),-1,1)!="`x'"
  }   
}

  // drop by fscore
  // (disabled: would keep the higher-scoring row of two adjacent candidates)
*drop if dup1>0 & names_id==names_id[_n-1] & fscore_pca<fscore_pca[_n-1] & fscore_pca!=. & fscore_pca[_n-1]!=.
*drop if dup1>0 & names_id==names_id[_n+1] & fscore_pca<fscore_pca[_n+1] & fscore_pca!=. & fscore_pca[_n+1]!=.

  // for remaining dups, treat them like quarterbacks: if you have 2, that means you really have 0  (and reset all schools variables)
  // dup1a re-tags names_id values that still occur more than once; for those
  // rows every schools-side field is blanked when _merge==3. _merge is not
  // created in this section and is expected to exist from an earlier merge.
duplicates t names_id, gen(dup1a)
replace schv_id = . 	if _merge==3 & dup1a>0
replace block = "" 		if _merge==3 & dup1a>0
replace village = "" 	if _merge==3 & dup1a>0
replace nschools = . 	if _merge==3 & dup1a>0
replace count_sch_id = . if _merge==3 & dup1a>0
replace fscore_pca = .  if _merge==3 & dup1a>0
replace fuzzy_pca = ""  if _merge==3 & dup1a>0
replace Mvillage = ""   if _merge==3 & dup1a>0
duplicates drop
// Report only; nothing is changed by "duplicates r".
duplicates r names_id

*Drop duplicates and bad matches, confirm a(n almost) 1-1 match
drop dup1* _merge Mvillage 
duplicates drop
// Reports for visual inspection of how close the result is to one-to-one.
duplicates r schv_id
duplicates r names_id

*rename
// Give the score and variant label names specific to this pass, so they do
// not share names with the other passes' outputs.
rename fscore_pca fscore_2vd11
rename fuzzy_pca fuzzy_2vd11

*Split "names" dataset into matched and unmatched villages
// Only rows that still carry a schools ID are kept and saved as the Step 3 result.
keep if schv_id!=.
save "temp_matched3.dta", replace

}

****************************************************************** 
****************************************************************** 

** Step 4:  Append 3 reclink matched datasets for export
{

* Stack the three per-pass match files (in pass order) into the export file.
use "temp_matched1.dta", clear
append using "temp_matched2.dta" "temp_matched3.dta"
save "pca_2001_names_matched_reclink.dta", replace

}

****************************************************************** 
****************************************************************** 

** Step 5:  Masala Merge on remaining unmatched schools (with blocks!)
{ 

do "$path_code/merge/masala_merge_lp_server.do"
set more off, perm

  // NOTE: The masala merge program loaded above (masala_merge_lp_server.do) is buggy, but it works wonders.
  // It is modified from the original version written by Paul Novosad, which he generously shared with us.
  // Be sure to "clear all" before running masala merge, to reset locals and globals.
  // This whole step might take 24 hours to run completely, so be patient.

*Prep unmatched names for masala merge
* ---- Master side: unmatched PCA village names --------------------------
* Rebuilds temp_clean.dta and also writes the same data to temp.dta.
* (An earlier variant read this file through the $smerge path.)
use "pca_2001_names_merges_all_temp_unmatched.dta", clear
tostring st_code dt_code bk_code*, replace
capture drop stdt
generate stdt = st_code + " " + dt_code

* Swap round brackets for curly ones in every village* variable.
foreach v of varlist village* {
	replace `v' = subinstr(subinstr(`v',"(","{",.),")","}",.)
}

* Walk the name variants in priority order. A non-empty variant that equals
* any earlier variant is overwritten with the marker "--DUPLICATE--".
local earlier village_pca
foreach v in village_vd village11 village_conc01 village_conc11 village_vd11 {
	local same 0
	foreach e of local earlier {
		local same "`same' | `v'==`e'"
	}
	replace `v' = "--DUPLICATE--" if (`same') & `v'!=""
	local earlier `earlier' `v'
}
save "temp_clean.dta", replace
save "temp.dta", replace

* ---- Using side: unmatched school-census villages ----------------------
use "$schools/school_census_for_merge_temp_unmatched.dta", clear
tostring st_code dt_code bk_code*, replace
capture drop stdt
generate stdt = st_code + " " + dt_code
replace village = subinstr(subinstr(village,"(","{",.),")","}",.)
save "school_census_for_merge_temp_unmatched_temp.dta", replace

*Masala merges
* One outer pass per block-code variable and, inside it, one masala_merge2 run
* per village-name variant. Each run keeps the matched rows, restores the
* variant's own variable name, labels the rows with the block variable (bk) and
* variant (vi) used, and saves them as MMout_<tag>_<variant>.dta.
local bkvars bk_code_pca bk_code11 bk_code2_vd11
local bktags pca 11 2vd11
forvalues k = 1/3 {
	local bkvar : word `k' of `bkvars'
	local bktag : word `k' of `bktags'

	* the varlist of the inner loop is resolved against the data in memory
	quietly use "temp.dta", clear
	foreach vi of varlist village_pca village_vd village11 village_conc01 village_conc11 village_vd11 {
		quietly {
			use "temp.dta", clear
			local vi_stub = subinstr(substr("`vi'",8,99),"_","",.)
			rename `vi' village
			keep if village!="--DUPLICATE--" & village!=""
		}
		masala_merge2 stdt `bkvar' using "school_census_for_merge_temp_unmatched_temp.dta", s1(village) outfile(out_`bktag'_`vi_stub') dist(5) quietly
		display "Masala Merge `bkvar' `vi_stub' complete!"
		keep if _masala_merge==3
		rename (village village_using block) (`vi' villageS blockS)
		generate bk = "`bktag'"
		generate vi = "`vi'"
		drop merge*
		save "MMout_`bktag'_`vi_stub'.dta", replace
	}
}

* Stack every per-variant match file into MMout_all_bk.dta and delete each
* piece once it has been stacked. A piece that does not exist is skipped; any
* other failure (unreadable file, append type conflict, failed save) now stops
* the script, so the accumulated file is never overwritten with partial data
* and no piece is erased before it has been saved into the stack.
cap erase "MMout_all_bk.dta"
foreach bk of newlist _pca _11 _2vd11 {
  foreach vi of newlist _pca _vd _11 _conc01 _conc11 _vd11 {
	capture confirm file "MMout`bk'`vi'.dta"
	if _rc {
		continue
	}
	use "MMout`bk'`vi'.dta", clear
	* the stack does not exist yet on the first piece
	capture confirm file "MMout_all_bk.dta"
	if !_rc {
		append using "MMout_all_bk.dta"
	}
	save "MMout_all_bk.dta", replace
	erase "MMout`bk'`vi'.dta"
  }
}

}

****************************************************************** 
******************************************************************

** Step 6:  Masala Merge on remaining unmatched schools (without blocks!)
{ 

do "$path_code/merge/masala_merge_lp_server.do"
set more off, perm

  // NOTE: The masala merge program loaded above (masala_merge_lp_server.do) is buggy, but it works wonders.
  // It is modified from the original version written by Paul Novosad, which he generously shared with us.
  // Be sure to "clear all" before running masala merge, to reset locals and globals.
  // This whole step might take 24 hours to run completely, so be patient.

*Prep unmatched names for masala merge
* ---- Master side: unmatched PCA village names --------------------------
* Rebuilds temp_clean.dta and also writes the same data to temp.dta.
use "pca_2001_names_merges_all_temp_unmatched.dta", clear
tostring st_code dt_code bk_code*, replace
capture drop stdt
generate stdt = st_code + " " + dt_code

* Swap round brackets for curly ones in every village* variable.
foreach v of varlist village* {
	replace `v' = subinstr(subinstr(`v',"(","{",.),")","}",.)
}

* Walk the name variants in priority order. A non-empty variant that equals
* any earlier variant is overwritten with the marker "--DUPLICATE--".
local earlier village_pca
foreach v in village_vd village11 village_conc01 village_conc11 village_vd11 {
	local same 0
	foreach e of local earlier {
		local same "`same' | `v'==`e'"
	}
	replace `v' = "--DUPLICATE--" if (`same') & `v'!=""
	local earlier `earlier' `v'
}
save "temp_clean.dta", replace
save "temp.dta", replace

* ---- Using side: unmatched school-census villages ----------------------
use "$schools/school_census_for_merge_temp_unmatched.dta", clear
tostring st_code dt_code bk_code*, replace
capture drop stdt
generate stdt = st_code + " " + dt_code
replace village = subinstr(subinstr(village,"(","{",.),")","}",.)
save "school_census_for_merge_temp_unmatched_temp.dta", replace

*Masala merges
* One masala_merge2 run per village-name variant, keyed on stdt only (no block
* code). Each run keeps the matched rows, restores the variant's own variable
* name, labels the rows (bk = "nobk", vi = variant) and saves them as
* MMout_nobk_<variant>.dta.
* the varlist of the loop is resolved against the data in memory
quietly use "temp.dta", clear
foreach vi of varlist village_pca village_vd village11 village_conc01 village_conc11 village_vd11 {
	quietly {
		use "temp.dta", clear
		local vi_stub = subinstr(substr("`vi'",8,99),"_","",.)
		rename `vi' village
		keep if village!="--DUPLICATE--" & village!=""
	}
	masala_merge2 stdt using "school_census_for_merge_temp_unmatched_temp.dta", s1(village) outfile(out_nobk_`vi_stub') dist(5) quietly
	display "Masala Merge no bk_code `vi_stub' complete!"
	keep if _masala_merge==3
	rename (village village_using block) (`vi' villageS blockS)
	generate bk = "nobk"
	generate vi = "`vi'"
	drop merge*
	save "MMout_nobk_`vi_stub'.dta", replace
}

* Stack every per-variant match file into MMout_all_nobk.dta and delete each
* piece once it has been stacked. A piece that does not exist is skipped; any
* other failure (unreadable file, append type conflict, failed save) now stops
* the script, so the accumulated file is never overwritten with partial data
* and no piece is erased before it has been saved into the stack.
cap erase "MMout_all_nobk.dta"
foreach bk of newlist _nobk {
  foreach vi of newlist _pca _vd _11 _conc01 _conc11 _vd11 {
	capture confirm file "MMout`bk'`vi'.dta"
	if _rc {
		continue
	}
	use "MMout`bk'`vi'.dta", clear
	* the stack does not exist yet on the first piece
	capture confirm file "MMout_all_nobk.dta"
	if !_rc {
		append using "MMout_all_nobk.dta"
	}
	save "MMout_all_nobk.dta", replace
	erase "MMout`bk'`vi'.dta"
  }
}

}

******************************************************************
******************************************************************
